import os
import pandas as pd
import csv
import numpy as np
import math
from matplotlib.lines import Line2D
import matplotlib as mpl
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
import statsmodels.stats.multitest as sm

Summary

This file includes the results of a microbiome analysis performed on samples taken from four individuals that were originally used to determine the “Impact of DNA source on genetic variant detection from human whole-genome sequencing data”.

This included blood, saliva and buccal samples taken from four individuals (blood samples were taken at a different time than saliva and buccal samples). Additionally, a methylation-based enrichment for eukaryotic DNA was performed on the saliva and buccal samples.

Sections not completed:
- Absolute number of reads between different reference databases
- The intersection between different reference databases
- Functional profiling using HUMAnN2 (if I want to look more into the families that are differentially abundant)
- HUMAnN3 taxonomy
- Functional profiling using HUMAnN3
- Functional profiling using MEGAHIT and anvi’o
- Some kind of functional richness?

Basic processing steps

Obtaining reads

Fastq.gz files were downloaded from the ENA database, project accession number PRJNA523344

Kneaddata

Kneaddata was used for quality control and removal of human sequences. This included:
- Trimmomatic 0.39: “SLIDINGWINDOW:4:20 MINLEN:50”
- Bowtie2 with the GRCh38_PhiX database (to remove human and PhiX reads): “–fast –dovetail”

parallel -j 2 --link 'kneaddata -i {1} -i {2} -o kneaddata_out/ \
-db /home/shared/bowtiedb/GRCh38_PhiX --trimmomatic /home/robyn/Trimmomatic-0.39/ \
-t 40 --trimmomatic-options "SLIDINGWINDOW:4:20 MINLEN:50" \
--bowtie2-options "--fast --dovetail" --remove-intermediate-output' \
 ::: raw_data/*_1.fastq.gz ::: raw_data/*_2.fastq.gz
 
 mkdir kneaddata_out/contam_seq
mv kneaddata_out/*_contam*.fastq kneaddata_out/contam_seq
kneaddata_read_count_table --input kneaddata_out --output kneaddata_read_counts.txt
concat_paired_end.pl -p 4 --no_R_match -o cat_reads kneaddata_out/*_paired_*.fastq 

Reads kept after Kneaddata

#3 
#set up colors function (to get up to 120 colors, but with up to 40 unique colors)
def get_cols(num):
    colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
    norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
    m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
    colors_20 = [m1.to_rgba(a) for a in range(20)]
    colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
    if num < 21: return colors_20
    elif num < 41: return colors_40
    else: return colors_40+colors_40+colors_40
#and colors and shapes for different participants and body sites
colors_dict, shapes_dict = {'Blood':'#900C3F', 'Saliva':'#016F85', 'Buccal':'#ff8300', 'Saliva_euk':'#02aed1', 'Buccal_euk':'#FFC300'}, {'Huref':'o', 'PGPC-0002':'^', 'PGPC-0005':'*', 'PGPC-0006':'s', 'PGPC-0050':'p'}
#4
#get numbers of reads for different steps
reads = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/read_counts.txt', sep='\t', index_col=3, header=0)
participant_dict, site_dict, full_name_dict = {}, {}, {}
samples = list(reads.index.values)
for s in samples:
    participant_dict[s] = reads.loc[s, 'Participant']
    site_dict[s] = reads.loc[s, 'Body site']
    full_name_dict[s] = reads.loc[s, 'Participant']+' '+reads.loc[s, 'Body site']
total_reads = pd.DataFrame(reads.loc[:, 'cat_reads'])
sample_names = [participant_dict[name]+' '+site_dict[name] for name in samples]
colors = [colors_dict[s] for s in list(reads.loc[:, 'Body site'].values)]
shapes = [shapes_dict[s] for s in list(reads.loc[:, 'Participant'].values)]

Percentage reads remaining

#5
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Reads kept (%)')
plt.xlim([-0.5,20.5])
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads kept (%)')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()

Number of reads remaining

#6
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.xlim([-0.5,20.5])
plt.ylabel('Reads remaining')

plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads remaining')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()

Table of reads remaining

#7
reads_remain = reads.loc[:, ['Percentage', 'cat_reads']].rename(index=full_name_dict, columns = {'cat_reads':'Number'})
#8
py$reads_remain %>%
  kable() %>%
  kable_styling()
Percentage Number
Huref Blood 0.2319067 1087311
PGPC-0002 Blood 0.5748318 3321664
PGPC-0005 Blood 0.5149286 1903482
PGPC-0006 Blood 0.7146067 3593441
PGPC-0050 Blood 0.6870164 2940390
PGPC-0002 Saliva 3.9289510 16561343
PGPC-0005 Saliva 55.6458826 244058333
PGPC-0006 Saliva 13.1608121 60497393
PGPC-0050 Saliva 56.3700720 257049004
PGPC-0002 Buccal 2.8112226 11462781
PGPC-0005 Buccal 2.8202504 12751614
PGPC-0006 Buccal 5.1521560 22434503
PGPC-0050 Buccal 2.4297107 10667011
PGPC-0002 Saliva_euk 1.2333408 5345888
PGPC-0005 Saliva_euk 8.3104464 38138382
PGPC-0006 Saliva_euk 1.9067564 8786635
PGPC-0050 Saliva_euk 5.6187272 25037552
PGPC-0002 Buccal_euk 0.9385856 4380036
PGPC-0005 Buccal_euk 1.2119820 5584415
PGPC-0006 Buccal_euk 1.5567291 7232280
PGPC-0050 Buccal_euk 1.3836141 6382288

Taxonomic profiling

The taxonomy has been profiled using:
1. HUMAnN
- HUMAnN2 and MetaPhlAn2
- HUMAnN3 and MetaPhlAn3
2. Kraken2 with Bracken
- GTDB (no confidence parameter set) -
using the database constructed using Struo, release 89 - GTDB (confidence = 0.1)
- Minikraken v1 (no human genome, no confidence parameter set)
- Minikraken v1 (no human genome, confidence = 0.1)
- Minikraken v2 (with human genome, no confidence parameter set)
- Minikraken v2 (with human genome, confidence = 0.1)
- RefSeq Complete v93 (no confidence parameter set)
- RefSeq Complete v93 (confidence = 0.1)

Commands run:

humann2_databases --download chocophlan full humann_database/
humann2_databases --download uniref uniref90_diamond humann_database/

# Do this for each of uniref_90 and uniref_50

parallel -j 4 'humann2 --threads 10 --input {} --output humann2_out_90/{/.} --protein-database humann_database/uniref_90/' ::: cat_reads/*fastq

mkdir humann2_log_90
python
import os
samples = os.listdir('humann2_out_90')
for sample in samples:
    os.system('cp humann2_out_90/'+sample+'/'+sample+'_humann2_temp/'+sample+'.log humann2_log_90/')
    
quit()

mkdir humann2_final_out_90

humann2_join_tables -s --input humann2_out_90/ --file_name pathabundance --output humann2_final_out_90/humann2_pathabundance.tsv
humann2_join_tables -s --input humann2_out_90/ --file_name pathcoverage --output humann2_final_out_90/humann2_pathcoverage.tsv
humann2_join_tables -s --input humann2_out_90/ --file_name genefamilies --output humann2_final_out_90/humann2_genefamilies.tsv

mkdir humann2_final_out_90/bugs_lists/

python

import os
samples = os.listdir('humann2_out_90')
for sample in samples:
    os.system('cp humann2_out_90/'+sample+'/'+sample+'_humann2_temp/'+sample+'_metaphlan_bugs_list.tsv humann2_final_out_90/bugs_lists/')

quit()

merge_metaphlan_tables.py humann2_final_out_90/bugs_lists/*tsv > humann2_final_out_90/metaphlan_merged.tsv
rm -r humann2_final_out_90/bugs_lists

cd humann2_final_out_90/
humann2_renorm_table --input humann2_pathabundance.tsv --units relab --output humann2_pathabundance_relab.tsv
humann2_split_stratified_table --input humann2_pathabundance_relab.tsv --output ./

humann2_renorm_table --input humann2_genefamilies.tsv --units relab --output humann2_genefamilies_relab.tsv
humann2_split_stratified_table --input humann2_genefamilies_relab.tsv --output ./
pip install humann
pip install metaphlan

humann_databases --download chocophlan full humann_databases/humann3_databases/ --update-config yes
humann_databases --download uniref uniref90_diamond humann_databases/humann3_databases/ --update-config yes
humann_databases --download uniref uniref50_diamond humann_databases/humann3_databases/ --update-config yes
humann_databases --download utility_mapping full humann_databases/humann3_databases/ --update-config yes

wget https://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
tar xvf diamond-linux64.tar.gz

parallel -j 1 'humann --input {} --output human_metagenome/humann3_out/ --threads 12' ::: human_metagenome/cat_reads/*.fastq

merge_metaphlan_tables.py humann_final_out/bugs_lists/*tsv > humann3_final_out/metaphlan_merged.tsv
rm -r humann3_final_out/bugs_lists

humann_join_tables -s --input humann3_out/ --file_name pathabundance --output humann3_final_out/humann3_pathabundance.tsv
humann_join_tables -s --input humann3_out/ --file_name pathcoverage --output humann3_final_out/humann3_pathcoverage.tsv
humann_join_tables -s --input humann3_out/ --file_name genefamilies --output humann3_final_out/humann3_genefamilies.tsv

humann_renorm_table --input humann3_pathabundance.tsv --units relab --output humann3_pathabundance_relab.tsv
humann_split_stratified_table --input humann3_pathabundance_relab.tsv --output ./

humann_renorm_table --input humann3_genefamilies.tsv --units relab --output humann3_genefamilies_relab.tsv
humann_split_stratified_table --input humann3_genefamilies_relab.tsv --output ./

find . -name \*aligned.sam -type f -delete
find . -name \*unaligned.fa -type f -delete
find . -name \*aligned.tsv -type f -delete

humann_renorm_table --input humann3_genefamilies.tsv --output humann3_genefamilies_cpm.tsv --units cpm --update-snames

humann_regroup_table --input humann3_genefamilies_cpm.tsv --output humann3_genefamilies_cpm_ko_50.tsv --groups uniref50_ko
humann_regroup_table --input humann3_genefamilies_cpm.tsv --output humann3_genefamilies_cpm_ko_90.tsv --groups uniref90_ko
#GTDB
wget http://ftp.tue.mpg.de/ebio/projects/struo/GTDB_release89/kraken2/

#RefSeq complete
sudo mount -t ramfs none /scratch/ramdisk/
sudo cp -a $DBNAME /ramdisk
sudo cp -a /home/shared/Kraken2.0.8_Bracken150mer_RefSeqCompleteV93/ /scratch/ramdisk/


#For each database (with and without adding --confidence 0.1:
parallel -j 1 'kraken2 --use-names --threads 10 --db gtdb/kraken2_struo/ --fastq-input {} --report kraken2_out/{/.}.report > kraken2_out/{/.}.kraken' ::: human_metagenome/cat_reads/*.fastq

parallel -j 1 'bracken -d gtdb/kraken2_struo/ -i {} -l S -o {.}.bracken' ::: kraken2_report/*.kreport

MetaPhlAn2 taxonomy nMDS plots

#9
#get the taxonomy file and sort it to strain and genus level
taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/metaphlan_merged.tsv', sep='\t', header=0, index_col=0)
tax_names = list(taxa.index.values)
keeping = []
for a in range(len(tax_names)):
    if 't__' in tax_names[a]:
        keeping.append(True)
    elif 'unclassified' in tax_names[a]:
        keeping.append(True)
    else:
        keeping.append(False)
strain = taxa.loc[keeping, :]
strain_names = list(strain.index.values)
strain_dict = {}
for i in range(len(strain_names)):
    strain_dict[strain_names[i]] = strain_names[i].split('|s__')[0].split('|g__')[1]
genus = strain.rename(index=strain_dict)
genus = genus.groupby(by=genus.index, axis=0).sum()

strain_t = strain.transpose()
genus_t = genus.transpose()
#10
#define the function that calculates the nmds plots

def transform_for_NMDS(df, dist_met='braycurtis'):
    X = df.iloc[0:].values
    y = df.iloc[:,0].values
    seed = np.random.RandomState(seed=3)
    X_true = X
    similarities = distance.cdist(X_true, X_true, dist_met)
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
    #print(similarities)
    pos = mds.fit(similarities).embedding_
    nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                        dissimilarity="precomputed", random_state=seed, n_jobs=1,
                        n_init=1)
    npos = nmds.fit_transform(similarities, init=pos)
    # Rescale the data
    pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
    npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
    # Rotate the data
    clf = PCA()
    X_true = clf.fit_transform(X_true)
    pos = clf.fit_transform(pos)
    npos = clf.fit_transform(npos)
    return pos, npos, nmds.stress_

handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]

Bray-Curtis distance

#11
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'braycurtis')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'braycurtis')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()

Euclidean distance

#12
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'euclidean')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'euclidean')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()

Jaccard distance

#13
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'jaccard')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'jaccard')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()

MetaPhlAn2 taxonomy relative abundance

Kingdom

Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Kingdom level for each sample.

#14
plt.figure(figsize=(7,5))
ax1 = plt.subplot(111)
plt.bar(list(taxa.columns.values), taxa.loc['k__Viruses', :].values, color='#C70039', edgecolor='k')
plt.bar(list(taxa.columns.values), taxa.loc['k__Bacteria', :].values, bottom=taxa.loc['k__Viruses', :].values, color='#026B81', edgecolor='k')
#plt.xticks(list(taxa.columns.values), sample_names, rotation=90)
empty = []
for x in range(0,21):
    empty.append('')
    ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
handles = [Patch(facecolor='#C70039', edgecolor='k', label='Viruses'), Patch(facecolor='#026B81', edgecolor='k', label='Bacteria')]
plt.legend(handles=handles, bbox_to_anchor=(1,1.05))
plt.tight_layout()
plt.show()

Genus

Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Genus level for each sample. Genera with below 1% maximum relative abundance have been removed.

#15
genus = genus[genus.max(axis=1) > 1]
genera = list(genus.index.values)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
gen_colors = get_cols(len(genus.index.values))
handles = []
for g in range(len(genera)):
    this_gen = genus.loc[genera[g], :].values
    if g == 0:
        ax1.bar(list(genus.columns.values), this_gen, color=gen_colors[g], edgecolor='k')
        total = this_gen
    else:
        ax1.bar(list(genus.columns.values), this_gen, bottom=total, color=gen_colors[g], edgecolor='k')
        total = this_gen+total
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=genera[g]))
empty = []
for x in range(0,21):
    empty.append('')
    ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
plt.legend(handles=handles, bbox_to_anchor=(1,1.05), ncol=2)
plt.tight_layout()
plt.show()

Kraken2 reads classified

#16
#get all samples into dataframes based on the database that they use
folders = sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2'))
del folders[0]
kraken_columns = {0:'Percent fragments clade', 1:'Number fragments clade', 2:'Number fragments taxon', 3:'Level', 4:'NCBI ID', 5:'Taxon name'}
kraken_all_db, bracken_all_db, all_domains = [], [], {}
for fol in folders:
    #if fol != 'gtdb_conf': continue
    if fol == 'db_genera' or fol == 'db_genera_in_common':
      continue
    if not os.path.isdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol):
        continue
    bracken, kraken, bracken_kreport = [], [], []
    bracken_pd, kraken_pd = [], []
    for fi in sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol)):
        if fi[-7:] == 'bracken':
            bracken.append(fi)
        elif fi[-7:] == 'kreport' and 'bracken' not in fi:
            kraken.append(fi)
        elif fi[-7:] == 'kreport':
            bracken_kreport.append(fi)
    for bk in bracken_kreport:
        with open('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+bk, 'rU') as f:
            bk = []
            domains = {}
            this_domain, domain_name = [], ''
            for row in csv.reader(f, delimiter='\t'):
                bk.append(row)
                row[5] = row[5].lstrip()
                if row[3] == 'D':
                    if domain_name != '':
                        domains[domain_name] = this_domain
                    this_domain, domain_name = [], row[5]
                else:
                    if row[3] != 'R' and row[3] != 'U' and 'D' not in row[3]:
                        this_domain.append(row[5])
            domains[domain_name] = this_domain
            for domain in domains:
                if domain in all_domains:
                    all_domains[domain] = list(set(all_domains[domain]+domains[domain]))
                else:
                    all_domains[domain] = list(set(domains[domain]))
    for b in bracken:
        if len(b) > 22:
            continue
        sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+b, sep='\t', header=0, index_col=0)
        b = b.replace('_150', '')
        sample.drop(['taxonomy_id', 'taxonomy_lvl', 'kraken_assigned_reads', 'added_reads', 'fraction_total_reads'], axis=1, inplace=True)
        sample.rename(columns={'new_est_reads':b[:-8]}, inplace=True)
        bracken_pd.append(sample)
    for k in kraken:
        sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+k, sep='\t', header=None, index_col=3)
        sample = sample.loc[['U', 'D'], :]
        if sample.iloc[1,1] == 'x':
          sample.iloc[1,1] = sample.iloc[1,2]
        sample = sample.rename(columns=kraken_columns).drop(['Number fragments taxon', 'NCBI ID'], axis=1).rename(columns={'Percent fragments clade':k[:-8]+'_percent', 'Number fragments clade':k[:-8]+'_reads'}).set_index('Taxon name')
        taxa = list(sample.index.values)
        taxa_dict = {}
        for t in taxa:
            taxa_dict[t] = t.replace(' ', '')
        sample = sample.rename(index=taxa_dict)
        kraken_pd.append(sample)
    bracken = pd.concat(bracken_pd, join='outer')
    kraken = pd.concat(kraken_pd, join='outer')
    kraken.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/gtdb_conf_test.csv')
    kraken = kraken.rename(index={'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea'})
    kraken = kraken.groupby(by=kraken.index, axis=0).sum()
    bracken = bracken.groupby(by=bracken.index, axis=0).sum().fillna(value=0)
    kraken_all_db.append(kraken), bracken_all_db.append(bracken)
#17
x1 = [x for x in range(21)]
x2 = [x+0.3 for x in range(21)]
tax_plotting = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'unclassified']
color_plotting = ['#EDBB99', '#5499C7', '#7DCEA0', '#F7DC6F', '#CCD1D1']
tax_paper = ['Bacteria', 'Eukaryota', 'Other', 'Unclassified']
color_paper = ['#5499C7', '#7DCEA0', '#CD6155', '#CCD1D1']
from_paper = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/from_paper.csv', header=0, index_col=0)

def get_summary_reads(kraken_db):
    fig = plt.figure(figsize=(15,15))
    ax1, ax2, ax3, ax4, ax5 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325)
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    ax5.set_title('From paper')
    ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
    x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
    axs = [ax1, ax2, ax3, ax4, ax5]
    for s in range(len(samples)):
        if s == 0:
            continue
        bottom = 0
        for t in range(len(tax_paper)):
            ax5.bar(x1[s], from_paper.loc[tax_paper[t], samples[s]], bottom=bottom, color=color_paper[t], edgecolor='k', width=0.6)
            bottom += from_paper.loc[tax_paper[t], samples[s]]
    for db in range(len(kraken_db)):
        ax_using = ax_plot[db]
        x = x_plot[db]
        db = kraken_db[db]
        handles = []
        for tax in range(len(tax_plotting)):
            handles.append(Patch(facecolor=color_plotting[tax], edgecolor='k', label=tax_plotting[tax]))
            tax = tax_plotting[tax]
            if tax not in list(db.index.values):
                db.loc[tax] = [0 for i in range(db.shape[1])]
        handles.append(Patch(facecolor=color_paper[2], edgecolor='k', label='Other'))
        db = db.fillna(value=0)
        for s in range(len(samples)):
            bottom = 0
            for t in range(len(tax_plotting)):
                prop = db.loc[tax_plotting[t], samples[s]+'_reads']
                cat = total_reads.loc[samples[s], 'cat_reads']
                prop = (prop/cat)*100
                ax_using.bar(x[s], prop, bottom=bottom, color=color_plotting[t], edgecolor='k', width=0.3)
                bottom += prop
    ax2.legend(handles=handles, bbox_to_anchor=(1,1.05))
    for ax in axs:
        plt.sca(ax)
        plt.xticks(x1, ['' for x in x1])
        plt.ylim([0, 100])
        plt.xlim([-0.5, 20.5])
    for x in x1:
        ax5.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
        ax4.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
    ax1.set_ylabel('Classified (%)'), ax3.set_ylabel('Classified (%)'), ax5.set_ylabel('Classified(%)')
    #plt.tight_layout()
    return

def get_summary_bacteria(kraken_db):
    tax_plotting = ['Bacteria']
    alpha = ['#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F']
    
    fig = plt.figure(figsize=(10,8))
    ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
    x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
    axs = [ax1, ax2, ax3, ax4]
    
    for db in range(len(kraken_db)):
        ax_using = ax_plot[db]
        x = x_plot[db]
        alp = alpha[db]
        db = kraken_db[db]
        for tax in range(len(tax_plotting)):
            tax = tax_plotting[tax]
            if tax not in list(db.index.values):
                db.loc[tax] = [0 for i in range(db.shape[1])]
        db = db.fillna(value=0)
        for s in range(len(samples)):
            bottom = 0
            for t in range(len(tax_plotting)):
                prop = db.loc[tax_plotting[t], samples[s]+'_reads']
                cat = total_reads.loc[samples[s], 'cat_reads']
                ax_using.bar(x[s], prop, bottom=bottom, color=alp, edgecolor='k', width=0.3)
                bottom += prop
    handles = []
    handles.append(Patch(facecolor=alpha[0], edgecolor='k', label='No confidence value'))
    handles.append(Patch(facecolor=alpha[1], edgecolor='k', label='Confidence=0.1'))
    ax2.legend(handles=handles, bbox_to_anchor=(1.6,1.03))
    for ax in axs:
        plt.sca(ax)
        plt.xticks(x1, ['' for x in x1])
        plt.semilogy()
        plt.xlim([-0.5, 20.5])
        #plt.ylim([0, 100])
        plt.xlim([-0.5, 20.5])
    for x in x1:
        pl = ((1/21)*(x+1))-0.02
        ax3.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax3.transAxes)
        ax4.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax4.transAxes)
    ax1.set_ylabel('Number of reads'), ax3.set_ylabel('Number of reads')
    #plt.tight_layout()
    return

Percent classified

A summary of the percentage of reads classified as different domains with different databases. Note that the ‘From paper’ plot uses the classifications given in the original paper, where 10,000 unmapped reads were classified using BLAST searches of the NCBI database.

#18
get_summary_reads(kraken_all_db)
plt.show()

Summary of number of bacteria

Summary of the number of reads that are classified as bacteria by each database.

#19
get_summary_bacteria(kraken_all_db)
plt.tight_layout()
plt.show()

Kraken2 taxonomy nMDS plots

These first plots are all separately with the confidence parameter set. See the last tab for those without the confidence parameter set.

#20
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []

for db in range(len(bracken_all_db)):
    d = int(db)
    db = bracken_all_db[db]
    species = list(db.index.values)
    keeping = []
    species_dict = {}
    for sp in species:
        if sp in bacteria:
            keeping.append(True)
            new_sp = sp.split('__')
            if len(new_sp) > 1:
                new_sp = new_sp[1]
            else:
                new_sp = new_sp[0]
            species_dict[sp] = new_sp.split(' ')[0].replace("'", '')
        else:
            keeping.append(False)
    in_len = db.shape[0]
    db = db.loc[keeping, :]
    strain.append(db)
    db = db.rename(index=species_dict)
    db = db.groupby(by=db.index, axis=0).sum()
    sums = db.sum(axis=0)
    gen_sums.append(sums)
    db = db.divide(sums, axis=1).multiply(100)
    genera.append(db)
    db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/'+db_names[d]+'.csv')
    gen_names = gen_names+list(db.index.values)
    db = db[db.max(axis=1) > 1]
    genera_1.append(db)
    gen_names_1 = gen_names_1+list(db.index.values)
gen_names = list(set(gen_names))
gen_names_1 = list(set(gen_names_1))
#21
def plot_four_nmds(dbs, metric, name):
    fig = plt.figure(figsize=(15,10))
    #fig.suptitle(name+metric+'\n\n\n')
    ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
    axs = [ax3, ax1, ax2, ax4]
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    
    for db in range(len(dbs)):
        n = db
        db = dbs[db].transpose()
        pos, npos, stress = transform_for_NMDS(db, metric)
        for a in range(len(npos)):
            axs[n].scatter(npos[a,0], npos[a,1], marker=shapes[a], color=colors[a], s=100, edgecolor='k')
        axs[n].set_xlabel('nMDS1')
        axs[n].set_ylabel('nMDS2')
    handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
    handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]

    ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
    plt.tight_layout()
    return

Bray-Curtis distance strain level

#22
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'braycurtis', 'NMDS confidence=0.1 strain ')
plt.show()

Bray-Curtis distance genus level

#23
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'braycurtis', 'NMDS confidence=0.1 genera ')
plt.show()

Euclidean distance strain level

#24
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'euclidean', 'NMDS confidence=0.1 strain ')
plt.show()

Euclidean distance genus level

#25
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'euclidean', 'NMDS confidence=0.1 genera ')
plt.show()

Jaccard distance strain level

#26
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'jaccard', 'NMDS confidence=0.1 strain ')
plt.show()

Jaccard distance genus level

#27
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'jaccard', 'NMDS confidence=0.1 genera ')
plt.show()

All plots with no confidence parameter set

Bray-curtis distance at strain level

#28
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'braycurtis', 'NMDS no confidence strain ')
plt.show()

Bray-curtis distance at genus level

#29
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'braycurtis', 'NMDS no confidence genera ')
plt.show()

Euclidean distance at strain level

#30
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'euclidean', 'NMDS no confidence strain ')
plt.show()

Euclidean distance at genus level

#31
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'euclidean', 'NMDS no confidence genera ')
plt.show()

Jaccard distance at strain level

#32
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'jaccard', 'NMDS no confidence strain ')
plt.show()

Jaccard distance at genus level

#33
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'jaccard', 'NMDS no confidence genera ')
plt.show()

Kraken2 taxonomy relative abundance

These plots are now only calculated for the classifications that used confidence = 0.1. Genera with below 1% maximum relative abundance are removed and the numbers in brackets are the number of reads that were classified as bacteria.

#34
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
#bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
#genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
gen_names_1 = sorted(gen_names_1)
def plot_genera(db, sums, tax_cols, gen_names_1, dname):
    plt.figure(figsize=(10,5))
    ax1 = plt.subplot(111)
    bottom = [0 for x in range(len(db.columns))]
    handles = []
    for g in range(len(gen_names_1)):
        if gen_names_1[g] in db.index.values:
            this_row = db.loc[gen_names_1[g], :].values
            ax1.bar(x1, this_row, bottom=bottom, color=tax_cols[g], edgecolor='k')
            handles.append(Patch(facecolor=tax_cols[g], edgecolor='k', label=gen_names_1[g]))
            for b in range(len(bottom)):
                bottom[b] += this_row[b]
    ax1.legend(handles=handles, bbox_to_anchor=(1, 1.03), ncol=3)
    plt.xticks(x1, ['' for x in x1])
    plt.ylabel('Relative abundance(%)')
    plt.xlim([-0.5, 20.5])
    plt.ylim([0, 100])
    for x in x1:
        n = str(int(sums[samples[x]]))
        ax1.text(x, -2, sample_names[x]+' ('+n+')', color=colors[x], rotation=90, va='top', ha='center')
    plt.tight_layout()
    return

gen_plot = [genera_1[1], genera_1[3], genera_1[5], genera_1[7]]
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
all_sums = [gen_sums[1], gen_sums[3], gen_sums[5], gen_sums[7]]
tax_cols = get_cols(len(gen_names_1))

Minikraken v1

#35
plot_genera(gen_plot[1], all_sums[1], tax_cols, gen_names_1, db_name[1])
plt.show()

Minikraken v2

#36
plot_genera(gen_plot[2], all_sums[2], tax_cols, gen_names_1, db_name[2])
plt.show()

GTDB

#37
plot_genera(gen_plot[0], all_sums[0], tax_cols, gen_names_1, db_name[0])
plt.show()

RefSeq Complete v93

#38
plot_genera(gen_plot[3], all_sums[3], tax_cols, gen_names_1, db_name[3])
plt.show()

Kraken2 similarities and differences between body sites

Here I have carried out ANCOM2 tests for differential abundance of genera between body sites. All genera are then plotted on a heatmap with mean values for each body site and stars to denote significant differences as determined by ANCOM.

While many of these results vary between databases, it is worth noting that some differences are consistent, e.g.:
1. Prevotella are always more abundant in saliva samples
2. Streptococcus are always more abundant in buccal samples
3. Klebsiella (where present) are always more abundant in blood samples

#39
def get_differences(new_genus, db_name):
    new_genus = new_genus.drop(['SRR8595488'], axis=1)
    new_genus = new_genus[new_genus.max(axis=1) > 0.1]
    samples = list(new_genus.columns)
    
    list_comparisons = [['Blood', 'Saliva'], ['Blood', 'Buccal'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk'], ['Saliva_euk', 'Buccal_euk']]
    comparisons, metadata, comp_len = [], [], []
    for a in range(len(list_comparisons)):
        keeping = []
        this_md = []
        for b in range(len(samples)):
            if site_dict[samples[b]] in list_comparisons[a]:
                keeping.append(True)
                this_md.append([samples[b], site_dict[samples[b]]])
            else:
                keeping.append(False)
        this_comp = new_genus.loc[:, keeping]
        this_comp = this_comp[this_comp.max(axis=1) > 0.1]
        comparisons.append(this_comp)
        this_md = pd.DataFrame(this_md, columns=['Samples', 'Groups'])
        #this_md.set_index('Samples', inplace=True)
        metadata.append(this_md)
        comp_len.append(this_comp.shape[0])
    new_genus = new_genus.rename(columns=site_dict, inplace=False)
    return comparisons, metadata, new_genus, comp_len
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
comparison_names = [r'Blood vs saliva', r'Blood vs buccal', r'Saliva vs buccal', r'Saliva vs saliva_euk', r'Buccal vs buccal_euk', r'Blood vs saliva_euk', r'Blood vs buccal_euk', r'Saliva_euk vs buccal_euk']
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")

get_ancom <- function(ft, md) {
all_ancom = list()
for (a in 1:8){
  feature_table = ft[a]
  meta_data = md[a]
  process = feature_table_pre_process(feature_table, meta_data, 'Samples', 'Groups', lib_cut=10, neg_lb=TRUE)
  ancom_out = ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var='Groups')
  all_ancom[[a]] <- ancom_out$out
}
return(all_ancom)
}
def sort_ancom_results(r_ancom):
  ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06 = [], [], [], []
  for a in range(len(r_ancom)):
    this_sig_09, this_sig_08, this_sig_07, this_sig_06 = [], [], [], []
    r_ancom[a].set_index('taxa_id', inplace=True)
    all_sp = list(r_ancom[a].index.values)
    for b in range(len(all_sp)):
      if r_ancom[a].loc[all_sp[b], 'detected_0.9'] == True:
        this_sig_09.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.8'] == True:
        this_sig_08.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.7'] == True:
        this_sig_07.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.6'] == True:
        this_sig_06.append(all_sp[b])
    ancom_lists_09.append(this_sig_09), ancom_lists_08.append(this_sig_08), ancom_lists_07.append(this_sig_07), ancom_lists_06.append(this_sig_06)
  return [ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06]
def plot_heatmap(new_genus, ANCOM):
    print(ANCOM)
    names = ['Blood', 'Saliva', 'Buccal', 'Saliva_euk', 'Buccal_euk']
    other_names = ['Abundance', r'Blood $vs$ saliva', r'Blood $vs$ buccal', r'Saliva $vs$ buccal', r'Saliva $vs$ saliva_euk', r'Buccal $vs$ buccal_euk', r'Blood $vs$ saliva_euk', r'Blood $vs$ buccal_euk', r'Saliva_euk $vs$ buccal_euk']
    colormap, norm = mpl.cm.get_cmap('plasma', 256), mpl.colors.Normalize(vmin=0, vmax=1)
    m = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
    if list(new_genus.index.values)[0][0] == 'A':
        new_genus = new_genus.iloc[::-1]
    figure = plt.figure(figsize=(5,new_genus.shape[0]*0.2))
    ax1 = plt.subplot(111)
    genus_names = list(new_genus.index.values)
    y = []
    for g in range(len(genus_names)):
        this_row = new_genus.loc[genus_names[g], :]
        values = [(np.mean(this_row[name].values)) for name in names]
        bottom, top, x = [g for a in range(5)], [1 for a in range(5)], [a for a in range(5)]
        y.append(g+0.5)
        ma = max(values)
        values = [v/ma for v in values]
        values = [m.to_rgba(v) for v in values]
        ax1.bar(x, top, bottom=bottom, color=values, edgecolor='k', width=1)
        x.append(5)
        ax1.scatter(x[-1], bottom[-1]+0.5, color='k', s=ma*5)
        sig, x_plt = [], x[-1]
        for a in range(len(ANCOM)):
            x_plt += 0.75
            if genus_names[g] in ANCOM[a]: ax1.scatter(x_plt, bottom[-1]+0.5, marker='*', color='k')
            x.append(x_plt)
    plt.ylim([0, len(genus_names)]), plt.xlim([-0.5, x[-1]+0.5])
    plt.yticks(y, genus_names)
    plt.xticks(x, names+other_names, rotation=90)
    ax1.xaxis.set_ticks_position('top')
    plt.tight_layout()
    plt.show()
    return

Minikraken v1 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 41 41 37 26 8
Blood vs buccal 37 27 21 15 3
Saliva vs buccal 45 3 3 2 1
Saliva vs saliva_euk 41 0 0 0 0
Buccal vs buccal_euk 39 2 0 0 0
Blood vs saliva_euk 38 32 24 15 3
Blood vs buccal_euk 34 11 5 2 2
Saliva_euk vs buccal_euk 43 5 3 2 1

Minikraken v1 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])

Minikraken v2 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_human_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 49 48 47 38 9
Blood vs buccal 48 35 16 14 4
Saliva vs buccal 39 5 4 1 0
Saliva vs saliva_euk 30 0 0 0 0
Buccal vs buccal_euk 37 0 0 0 0
Blood vs saliva_euk 49 47 33 22 5
Blood vs buccal_euk 50 19 14 12 4
Saliva_euk vs buccal_euk 42 4 3 3 2

Minikraken v2 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])

GTDB ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/gtdb_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 98 88 82 32 6
Blood vs buccal 91 21 14 8 5
Saliva vs buccal 96 18 10 2 1
Saliva vs saliva_euk 93 3 2 0 0
Buccal vs buccal_euk 95 8 5 1 0
Blood vs saliva_euk 92 27 17 11 4
Blood vs buccal_euk 75 7 7 3 2
Saliva_euk vs buccal_euk 97 19 13 7 1

GTDB heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])

RefSeq Complete v93 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/refseq_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 85 81 77 35 11
Blood vs buccal 82 65 36 21 7
Saliva vs buccal 69 9 6 3 0
Saliva vs saliva_euk 64 0 0 0 0
Buccal vs buccal_euk 76 4 4 1 0
Blood vs saliva_euk 86 72 37 20 4
Blood vs buccal_euk 79 18 15 8 2
Saliva_euk vs buccal_euk 84 12 10 5 1

RefSeq Complete v93 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])

Kraken2 intersection between databases

Now we are looking at the taxa (grouped to genus level, to hopefully allow for differences between databases) that are in common between the different databases that have been used with Kraken2. We are also using only the versions that have confidence=0.1.

Genera common to all databases (reads before and after)

The plots below here show the number of reads in each sample before and after removal of the genera that aren’t present in all databases.

#print(bracken_all_db)
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3

conf_bracken_genus = []
genus_in_each = []
for a in range(len(bracken_all_db)):
  if a not in [0, 2, 4, 6]:
    continue
  this_db = pd.DataFrame(bracken_all_db[a])
  taxa = list(this_db.index.values)
  tax_dict = {}
  for b in range(len(taxa)):
    orig_tax = taxa[b]
    if 's__' in taxa[b]:
      taxa[b] = taxa[b].replace('s__', '')
    taxa[b] = taxa[b].split(' ')[0]
    taxa[b] = taxa[b].replace("'", "")
    tax_dict[orig_tax] = taxa[b]
  this_db.rename(index=tax_dict, inplace=True)
  genus = this_db.groupby(by=this_db.index, axis=0).sum()
  conf_bracken_genus.append(genus)
  genus_in_each.append(list(genus.index.values))

[print(len(gen)) for gen in genus_in_each]
overall_genus = intersection(genus_in_each[0], genus_in_each[1])
overall_genus = intersection(overall_genus, genus_in_each[2])
overall_genus = intersection(overall_genus, genus_in_each[3])
print(len(overall_genus))
fig = plt.figure(figsize=(10,6))
ax = [plt.subplot(223), plt.subplot(221), plt.subplot(222), plt.subplot(224)]
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x3 = [x+0.2 for x in range(21)]

conf_bracken_overall = []
names = ['gtdb', 'minikrakenv1', 'minikrakenv2', 'refseq']
labels = ['GTDB', 'Minikraken v1 (without human)', 'Minikraken v2 (with human)', 'RefSeq Complete v93']
sum_reduced = []
for db in range(len(conf_bracken_genus)):
  new_db = pd.DataFrame(conf_bracken_genus[db].loc[overall_genus, :])
  conf_bracken_overall.append(new_db)
  new_db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'_common.csv')
  conf_bracken_genus[db].to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera_in_common/'+names[db]+'.csv')
  sums = new_db.sum(axis=0)
  sum_reduced.append(sums)
  ax[db].bar(x2, sums, color=colors, edgecolor='k', width=0.4, alpha=0.5)
  sums = conf_bracken_genus[db].sum(axis=0)
  ax[db].bar(x1, sums, color=colors, edgecolor='k', width=0.4)
  ax[db].semilogy()
  ax[db].set_title(labels[db])
  plt.sca(ax[db])
  if db == 1 or db == 2:
    plt.xticks([])
  else:
    plt.xticks(x3, sample_names, rotation=90)
ax[0].set_ylabel('Number of reads'), ax[1].set_ylabel('Number of reads')
handles = [Patch(facecolor='k', edgecolor='k', label='All genera'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Genera present in all')]
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()

Genera common to all databases (reads remaining)

fig = plt.figure(figsize=(8,4))
ax = plt.subplot(111)
x1 = [x*5 for x in range(21)]
x2 = [x+1 for x in x1]
x3 = [x+1 for x in x2]
x4 = [x+1 for x in x3]
xplt = [x+0.5 for x in x2]

x = [x3, x1, x2, x4]
al = [1, 0.8, 0.6, 0.4]
for db in range(len(sum_reduced)):
  ax.bar(x[db], sum_reduced[db], color=colors, width=1, edgecolor='k', alpha=al[db])
handles = [Patch(facecolor='k', edgecolor='k', label='Minikraken v1'), Patch(facecolor='k', edgecolor='k', alpha=0.8, label='Minikraken v2'), Patch(facecolor='k', edgecolor='k', alpha=0.6, label='GTDB'), Patch(facecolor='k', edgecolor='k', alpha=0.4, label='RefSeq Complete v93')]
ax.legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1))
plt.xticks(xplt, sample_names, rotation=90)
plt.semilogy()
plt.ylabel('Number of reads')
plt.tight_layout()
plt.show()

Genera stacked bars (absolute abundance)

for db in range(len(conf_bracken_overall)):
  rename_samples = {}
  for sample in list(conf_bracken_overall[db].columns):
    rename_samples[sample] = sample+'_'+names[db]
  this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
  if db == 0:
    overall_genus = this_db
  else:
    overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
#overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 25000]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(234)
ax = [ax1, plt.subplot(231, sharey=ax1), plt.subplot(232, sharey=ax1), plt.subplot(235, sharey=ax1)]
ax[1].set_ylabel('Number of reads')
ax[0].set_ylabel('Number of reads')
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
  keeping = []
  for s in list(overall_genus.columns):
    if name in s:
      keeping.append(True)
    else:
      keeping.append(False)
  other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
  new_db.append(other_new_db)
for db in range(len(new_db)):
  this_genera = list(new_db[db].index.values)
  handles = []
  for g in range(len(this_genera)):
    if g == 0:
      bottom = [0 for c in x1]
    these_values = new_db[db].loc[this_genera[g], :].values
    ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
    bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
  plt.sca(ax[db])
  if db == 1 or db == 2:
    plt.xticks([])
  else:
    plt.xticks(x1, sample_names, rotation=90)
  plt.semilogy()
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
#plt.tight_layout()
plt.show()

Genera stacked bars (relative abundance)

for db in range(len(conf_bracken_overall)):
  rename_samples = {}
  for sample in list(conf_bracken_overall[db].columns):
    rename_samples[sample] = sample+'_'+names[db]
  this_db = pd.DataFrame(conf_bracken_overall[db].rename(columns=rename_samples))
  if db == 0:
    overall_genus = this_db
  else:
    overall_genus = pd.concat([overall_genus, this_db], axis=1)
overall_genus = pd.DataFrame(overall_genus)
overall_genus = pd.DataFrame(overall_genus.divide(overall_genus.sum(axis=0)).multiply(100))
overall_genus = overall_genus[overall_genus.max(axis=1) > 1]
sums = overall_genus.sum(axis=0)
gen_colors = get_cols(overall_genus.shape[0])
print(overall_genus)
fig = plt.figure(figsize=(14,10))
ax1 = plt.subplot(234)
ax = [ax1, plt.subplot(231, sharey=ax1), plt.subplot(232, sharey=ax1), plt.subplot(235, sharey=ax1)]
x1 = [x for x in range(21)]
a = 0
col_samples = list(overall_genus.columns)
#line 20
new_db = []
for name in names:
  keeping = []
  for s in list(overall_genus.columns):
    if name in s:
      keeping.append(True)
    else:
      keeping.append(False)
  other_new_db = pd.DataFrame(overall_genus.loc[:, keeping])
  new_db.append(other_new_db)
for db in range(len(new_db)):
  this_genera = list(new_db[db].index.values)
  handles = []
  for g in range(len(this_genera)):
    if g == 0:
      bottom = [0 for c in x1]
    these_values = new_db[db].loc[this_genera[g], :].values
    ax[db].bar(x1, these_values, bottom=bottom, color=gen_colors[g], edgecolor='k', width=1)
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=this_genera[g]))
    bottom = [bottom[x]+these_values[x] for x in range(len(bottom))]
  plt.sca(ax[db])
  if db == 1 or db == 2:
    plt.xticks([])
  else:
    plt.xticks(x1, sample_names, rotation=90)
ax[2].legend(handles=handles, loc='upper left', bbox_to_anchor=(1,1), ncol=3)
plt.tight_layout()
plt.show()

Functional profiling

Function is profiled using (using the commands given above):
1. HUMAnN2
- Uniref90 database
- Uniref50 database
2. HUMAnN3
- Uniref90 and Uniref50
- HUMAnN3 runs both Uniref50 and Uniref90 at the same time, where these needed to be run separately for HUMAnN2

HUMAnN2 unaligned reads

plt.figure(figsize=(8,6))
ax1 = plt.subplot(111)
#unaligned_after_translation_uniref_90
#unaligned_after_translation_uniref_50
x1 = [x for x in range(21)]
x2 = [x+0.4 for x in range(21)]
x = [x+0.2 for x in range(21)]

ax1.bar(x1, reads.loc[:, 'unaligned_after_translation_uniref_90'].values, color=colors, edgecolor='k', width=0.4)
ax1.bar(x2, reads.loc[:, 'unaligned_after_translation_uniref_50'].values, color=colors, edgecolor='k', width=0.4, alpha=0.5)
plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Unaligned reads after translation (%)')
plt.xlim([-0.5,21])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
ax1.legend(handles=handles, bbox_to_anchor=(1,1))

plt.tight_layout()
plt.show()


## HUMAnN2 richness {.tabset}

After running HUMAnN2, the pathabundance, pathcoverage and genefamilies tables are joined, and the pathabundance is renormalised (relative abundance calculated) and split to tables that are stratified by species or not. Here we use the unstratified pathabundance file (the MetaPhlAn2 species results don’t seem to have been particularly accurate). The genefamilies file is stratified by default so we take only the overall values for the gene family (removing all species results) and re-calculate these values as relative abundances.

Gene family richness

def get_richness(df):
  snames = list(df.columns)
  num_genes = []
  for sn in snames:
    one_sample = pd.DataFrame(df.loc[:, sn])
    one_sample = one_sample[one_sample.max(axis=1) > 0]
    num_genes.append(one_sample.shape[0])
  return snames, num_genes

genes_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_90.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_90.index.values)
keeping = []
for gn in gene_names:
    new_gn = gn.split('|')[0]
    if gn == new_gn: keeping.append(True)
    else: keeping.append(False)
genes_90_abs = genes_90.loc[keeping, :]
genes_90 = genes_90_abs.divide(genes_90_abs.sum(axis=0)).multiply(100)
pathways_90 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')

snames_genes_90, genes_90_richness = get_richness(genes_90)
snames_pways_90, pways_90_richness = get_richness(pathways_90)

genes_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_genefamilies.tsv', header=0, index_col=0, sep='\t')
genes_50.drop('UNMAPPED', axis=0, inplace=True)
gene_names = list(genes_50.index.values)
keeping = []
for gn in gene_names:
    new_gn = gn.split('|')[0]
    if gn == new_gn: keeping.append(True)
    else: keeping.append(False)
genes_50_abs = genes_50.loc[keeping, :]
genes_50 = genes_50_abs.divide(genes_50_abs.sum(axis=0)).multiply(100)
pathways_50 = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_50/humann2_pathabundance_relab_unstratified.tsv', header=0, index_col=0, sep='\t')

snames_genes_50, genes_50_richness = get_richness(genes_50)
snames_pways_50, pways_50_richness = get_richness(pathways_50)
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]

ax1.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x, genes_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x1[:len(genes_50_richness)], genes_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xlim([-0.5,21]), plt.xticks([])
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.ylabel('Gene family\nrichness')

plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21])
plt.ylabel('Gene family\nrichness (log scale)')
plt.xticks(x, sample_names, rotation=90)
plt.tight_layout()
plt.show()

Pathway richness

#54
plt.figure(figsize=(6,6))
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

x = [a for a in range(len(genes_90.columns))]
x1 = [a+0.4 for a in x]

ax1.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax1.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
ax2.bar(x, pways_90_richness, width=0.4, color=colors, edgecolor='k')
ax2.bar(x1, pways_50_richness, width=0.4, color=colors, edgecolor='k', alpha=0.5)
plt.sca(ax1)
plt.xticks([])
plt.ylabel('Pathway richness')
handles = [Patch(facecolor='k', edgecolor='k', label='Uniref90'), Patch(facecolor='k', edgecolor='k', alpha=0.5, label='Uniref50')]
plt.legend(handles=handles, bbox_to_anchor=(1,1))
plt.xlim([-0.5,21])
plt.sca(ax2)
plt.semilogy()
plt.xlim([-0.5,21]), plt.xticks(x, sample_names, rotation=90)
plt.ylabel('Pathway richness (log scale)')
plt.tight_layout()
plt.show()

HUMAnN2 differential abundance between body sites

Now we are looking at the gene families and pathways that are differentially abundant between body sites, using the lists that are output by HUMAnN2.

Number of differentially abundant pathways between body sites (Uniref90)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

def get_diff_abundant(genes, name_csv):
    col_names = list(genes.columns)
    col_names_dict = {}
    for cn in col_names:
        if 'SRR8595488' in cn:
          genes.drop([cn], axis=1, inplace=True)
        cn_new = cn.split('_')[0]
        col_names_dict[cn] = site_dict[cn_new]
    new_genes = genes.rename(columns=col_names_dict)
    comparisons = [['Saliva', 'Blood'], ['Buccal', 'Blood'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk']]
    fig = plt.figure(figsize=(15,8))
    ax = [plt.subplot2grid((2,8), (0,1), colspan=2), plt.subplot2grid((2,8), (0,3), colspan=2), plt.subplot2grid((2,8), (0,5), colspan=2), plt.subplot2grid((2,8), (1,0), colspan=2), plt.subplot2grid((2,8), (1,2), colspan=2), plt.subplot2grid((2,8), (1,4), colspan=2), plt.subplot2grid((2,8), (1,6), colspan=2)]
    for a in range(len(comparisons)):
        ax[a].set_title(comparisons[a][0]+r' $vs$ '+comparisons[a][1])
        this_comparison = new_genes.loc[:, comparisons[a]]
        this_comparison = this_comparison[this_comparison.max(axis=1) > 0]
        genes_comp = list(this_comparison.index.values)
        fc, pval, colors_p, this_fc = [], [], [], []
        for b in range(len(genes_comp)):
            c1, c2 = list(this_comparison.loc[genes_comp[b], comparisons[a][0]]), list(this_comparison.loc[genes_comp[b], comparisons[a][1]])
            
            c1, c2 = [0.000001 if x == 0 else x for x in c1], [0.000001 if x == 0 else x for x in c2]
            c1, c2 = [math.log2(c1[c]) for c in range(len(c1))], [math.log2(c2[c]) for c in range(len(c2))]
            t, p = stats.ttest_ind(c1, c2)
            c1, c2 = np.median(c1), np.median(c2)
            diff = c1/c2
            if diff < 1 and diff > 0: diff = -(1/diff)
            fc.append(diff), pval.append(p)
            this_fc.append([genes_comp[b], diff, p])
        colors_p_all = [colors_dict[comparisons[a][0]], colors_dict[comparisons[a][1]]]
        for c in range(len(fc)):
            if fc[c] >= 2 and pval[c] <= 0.05: colors_p.append(colors_p_all[1])
            elif fc[c] <= -2 and pval[c] <= 0.05: colors_p.append(colors_p_all[0])
            else: colors_p.append('#B1B1B1')
            pval[c] = -math.log10(pval[c])
        com = comparisons[a][0]+'_'+comparisons[a][1]
        if a == 0:
          df_pval = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
          df_pval.set_index('Genes/pathways', inplace=True)
        else:
          new_df = pd.DataFrame(this_fc, columns=['Genes/pathways', com+'_FC', com+'_p'])
          new_df.set_index('Genes/pathways', inplace=True)
          df_pval = pd.concat([df_pval, new_df], axis=1, join='outer')
        ma = max([max(fc), abs(min(fc))])+0.5
        if ma > 10: ma = 10.5
        ax[a].set_xlim([-ma, ma])
        ax[a].scatter(fc, pval, marker='o', c=colors_p, s=10)
        ax[a].set_xlabel(r'Log$_2$ fold change')
        if a == 0 or a == 3:
          ax[a].set_ylabel('-log(p value)')
        plt.sca(ax[a])
    df_pval.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/'+name_csv+'.csv')
    handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
    ax[2].legend(handles=handles, bbox_to_anchor=(1.04,1), loc='upper left')
    return
pways = pd.DataFrame(pathways_90)
get_diff_abundant(pways, 'pathways_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Number of differentially abundant pathways between body sites (Uniref50)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

pways = pd.DataFrame(pathways_50)
get_diff_abundant(pways, 'pathways_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Number of differentially abundant gene families between body sites (Uniref90)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

pways = pd.DataFrame(genes_90)
get_diff_abundant(pways, 'genes_90')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()

Number of differentially abundant gene families between body sites (Uniref50)

Volcano plot showing the number of differentially abundant pathways between body sites. Colored points denote genes that are significantly differentially abundant (T-test p<0.05 and log2 (median) FC > 2 or < -2) and they are colored by which body site they are highest in abundance in. Note that very large fold changes are not shown. Initial results showed that there were very few genes with > 10 or < -10, so the maximum plotted is this.

pways = pd.DataFrame(genes_50)
get_diff_abundant(pways, 'genes_50')
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()